{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n",
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 0.453 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils import to_categorical\n",
    "from keras.models import Model\n",
    "from keras.layers import Embedding,Input,Dense,Conv1D,MaxPooling1D,Dense,Flatten,Lambda,LSTM\n",
    "from keras import backend as K\n",
    "import tensorflow as tf\n",
    "import matplotlib as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import jieba\n",
    "\n",
    "jieba.load_userdict(\"70000-dict.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 2125056 texts.\n"
     ]
    }
   ],
   "source": [
    "def divide_word(df,column='Comment'):\n",
    "    seg_list = jieba.cut(df[column], cut_all=False)\n",
    "    return \" \".join(seg_list)\n",
    "\n",
    "data = pd.read_csv('DMSC.csv')\n",
    "data = data[['Comment', 'Star']].dropna().copy()\n",
    "\n",
    "data['Comment'] = data.apply(divide_word,axis = 1)\n",
    "\n",
    "texts = data['Comment'].values\n",
    "labels = data['Star'].values\n",
    "labels_index = list(np.sort(data['Star'].unique()))\n",
    "\n",
    "\n",
    "print('Found %s texts.' % len(texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 270667 unique tokens.\n",
      "Shape of data tensor: (2125056, 80)\n",
      "Shape of label tensor: (2125056, 6)\n"
     ]
    }
   ],
   "source": [
    "MAX_SEQUENCE_LENGTH = 80 #样本最长369，可以测试更长的数据\n",
    "VALIDATION_SPLIT = 0.05 #数据量很大，验证集可以不用20%\n",
    "EMBEDDING_DIM = 200 #腾讯的维度，不能修改\n",
    "EPOCHS = 1\n",
    "BATCH_SIZE = 512\n",
    "\n",
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(texts)\n",
    "sequences = tokenizer.texts_to_sequences(texts)\n",
    "\n",
    "word_index = tokenizer.word_index\n",
    "print('Found %s unique tokens.' % len(word_index))\n",
    "\n",
    "data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)\n",
    "\n",
    "labels = to_categorical(np.asarray(labels))\n",
    "print('Shape of data tensor:', data.shape)\n",
    "print('Shape of label tensor:', labels.shape)\n",
    "\n",
    "# split the data into a training set and a validation set\n",
    "indices = np.arange(data.shape[0])\n",
    "np.random.shuffle(indices)\n",
    "data = data[indices]\n",
    "labels = labels[indices]\n",
    "nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])\n",
    "\n",
    "x_train = data[:-nb_validation_samples]\n",
    "y_train = labels[:-nb_validation_samples]\n",
    "x_val = data[-nb_validation_samples:]\n",
    "y_val = labels[-nb_validation_samples:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 70000 word vectors.\n"
     ]
    }
   ],
   "source": [
    "embeddings_index = {}\n",
    "with open('70000-small.txt','r') as f:\n",
    "    for i,line in enumerate(f):\n",
    "        if i == 0:\n",
    "            continue\n",
    "        values = line.split()\n",
    "        word = values[0]\n",
    "        coefs = np.asarray(values[1:], dtype='float32')\n",
    "        embeddings_index[word] = coefs\n",
    "print('Found %s word vectors.' % len(embeddings_index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))\n",
    "for word, i in word_index.items():\n",
    "    embedding_vector = embeddings_index.get(word)\n",
    "    if embedding_vector is not None:\n",
    "        # words not found in embedding index will be all-zeros.\n",
    "        embedding_matrix[i] = embedding_vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_layer = Embedding(len(word_index) + 1,\n",
    "                            EMBEDDING_DIM,\n",
    "                            weights=[embedding_matrix],\n",
    "                            input_length=MAX_SEQUENCE_LENGTH,\n",
    "                            trainable=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "input_4 (InputLayer)         (None, 80)                0         \n",
      "_________________________________________________________________\n",
      "embedding_1 (Embedding)      (None, 80, 200)           54133600  \n",
      "_________________________________________________________________\n",
      "lstm_4 (LSTM)                (None, 128)               168448    \n",
      "_________________________________________________________________\n",
      "dense_7 (Dense)              (None, 128)               16512     \n",
      "_________________________________________________________________\n",
      "dense_8 (Dense)              (None, 6)                 774       \n",
      "=================================================================\n",
      "Total params: 54,319,334\n",
      "Trainable params: 185,734\n",
      "Non-trainable params: 54,133,600\n",
      "_________________________________________________________________\n",
      "Train on 2018804 samples, validate on 106252 samples\n",
      "Epoch 1/1\n",
      "2018804/2018804 [==============================] - 2054s 1ms/step - loss: 1.2209 - acc: 0.4596 - val_loss: 1.1063 - val_acc: 0.5131\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7fbf66c3f748>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
    "embedded_sequences = embedding_layer(sequence_input)\n",
    "x = LSTM(128, dropout=0.5)(embedded_sequences)\n",
    "x = Dense(128, activation='relu')(x)\n",
    "preds = Dense(len(labels_index)+1, activation='softmax')(x)\n",
    "\n",
    "model = Model(sequence_input, preds)\n",
    "model.summary()\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer='rmsprop',\n",
    "              metrics=['acc'])\n",
    "\n",
    "# happy learning!\n",
    "model.fit(x_train, y_train, validation_data=(x_val, y_val),\n",
    "          epochs=EPOCHS, batch_size=BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
